home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Aminet 1 (Walnut Creek)
/
Aminet - June 1993 [Walnut Creek].iso
/
aminet
/
text
/
tex
/
tgrind.lha
/
TGrind
/
regexp.c
< prev
next >
Wrap
C/C++ Source or Header
|
1993-01-04
|
14KB
|
596 lines
#ifndef lint
static char *sccsid="@(#)regexp.c 1.2 (LBL) 4/12/85";
#endif
/*
* regular expression matching routines for tgrind/tfontedpr.
*
* These routines were written by Dave Presotto (I think) for vgrind.
* Minor mods & attempts to improve performance by Van Jacobson (van@lbl-rtsg)
* and Chris Torek (chris@maryland).
*
* Modifications.
* --------------
* 30Mar85 Van & Chris Changed expmatch to return pointer to start of what
* was matched in addition to pointer to match end.
* Several changes to improve performance (too numerous
* to mention).
* 11Dec84 Dave Presotto Written.
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef boolean
typedef int boolean;
#define TRUE 1
#define FALSE 0
#endif
#define NIL 0
#define makelower(c) (isupper((c)) ? tolower((c)) : (c))
extern int (*re_strncmp)(char *, char *, size_t); /* function used by expmatch to compare
* strings. The caller should make it point to
* strncmp if case is significant &
* lc_strncmp otherwise.
*/
void expconv(void);
/* lc_strncmp - like strncmp except that we convert the
* first string to lower case before comparing.
*/
int lc_strncmp(char *s1, char *s2, size_t len)
{
while (len-- > 0)
if (*s2 - makelower(*s1))
return(1);
else
s2++, s1++;
return(0);
}
/* The following routine converts an irregular expression to
* internal format.
*
* Either meta symbols (\a \d or \p) or character strings or
* operations ( alternation or perenthesizing ) can be
* specified. Each starts with a descriptor byte. The descriptor
* byte has STR set for strings, META set for meta symbols
* and OPER set for operations.
* The descriptor byte can also have the OPT bit set if the object
* defined is optional. Also ALT can be set to indicate an alternation.
*
* For metasymbols the byte following the descriptor byte identities
* the meta symbol (containing an ascii 'a', 'd', 'p', '|', or '('). For
* strings the byte after the descriptor is a character count for
* the string:
*
* meta symbols := descriptor
* symbol
*
* strings := descriptor
* character count
* the string
*
* operatins := descriptor
* symbol
* character count
*/
/*
* handy macros for accessing parts of match blocks
*/
#define MSYM(A) (*(A+1)) /* symbol in a meta symbol block */
#define MNEXT(A) (A+2) /* character following a metasymbol block */
#define OSYM(A) (*(A+1)) /* symbol in an operation block */
#define OCNT(A) (*(A+2)) /* character count */
#define ONEXT(A) (A+3) /* next character after the operation */
#define OPTR(A) (A+*(A+2)) /* place pointed to by the operator */
#define SCNT(A) (*(A+1)) /* byte count of a string */
#define SSTR(A) (A+2) /* address of the string */
#define SNEXT(A) (A+2+*(A+1)) /* character following the string */
/*
* bit flags in the descriptor
*/
#define OPT 1
#define STR 2
#define META 4
#define ALT 8
#define OPER 16
char *ure; /* pointer current position in unconverted exp */
char *ccre; /* pointer to current position in converted exp*/
char * convexp(char *re) /* unconverted irregular expression */
{
register char *cre; /* pointer to converted regular expression */
/* allocate room for the converted expression */
if (re == NIL)
return (NIL);
if (*re == '\0')
return (NIL);
cre = malloc (4 * strlen(re) + 3);
ccre = cre;
ure = re;
/* start the conversion with a \a */
*cre = META | OPT;
MSYM(cre) = 'a';
ccre = MNEXT(cre);
/* start the conversion (its recursive) */
expconv ();
*ccre = 0;
return (cre);
}
void expconv(void)
{
register char *cs; /* pointer to current symbol in converted exp */
register char c; /* character being processed */
register char *acs; /* pinter to last alternate */
register int temp;
/* let the conversion begin */
acs = NIL;
cs = NIL;
while (*ure != NIL) {
switch (c = *ure++) {
case '\\':
switch (c = *ure++) {
/* escaped characters are just characters */
default:
if (cs == NIL || (*cs & STR) == 0) {
cs = ccre;
*cs = STR;
SCNT(cs) = 1;
ccre += 2;
} else
SCNT(cs)++;
*ccre++ = c;
break;
/* normal(?) metacharacters */
case 'a':
case 'd':
case 'e':
case 'p':
if (acs != NIL && acs != cs) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NIL;
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
break;
}
break;
/* just put the symbol in */
case '^':
case '$':
if (acs != NIL && acs != cs) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NIL;
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
break;
/* mark the last match sequence as optional */
case '?':
if (cs)
*cs = *cs | OPT;
break;
/* recurse and define a subexpression */
case '(':
if (acs != NIL && acs != cs) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NIL;
}
cs = ccre;
*cs = OPER;
OSYM(cs) = '(';
ccre = ONEXT(cs);
expconv ();
OCNT(cs) = ccre - cs; /* offset to next symbol */
break;
/* return from a recursion */
case ')':
if (acs != NIL) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NIL;
}
cs = ccre;
*cs = META;
MSYM(cs) = c;
ccre = MNEXT(cs);
return;
/* mark the last match sequence as having an alternate */
/* the third byte will contain an offset to jump over the */
/* alternate match in case the first did not fail */
case '|':
if (acs != NIL && acs != cs)
OCNT(ccre) = ccre - acs; /* make a back pointer */
else
OCNT(ccre) = 0;
*cs |= ALT;
cs = ccre;
*cs = OPER;
OSYM(cs) = '|';
ccre = ONEXT(cs);
acs = cs; /* remember that the pointer is to be filles */
break;
/* if its not a metasymbol just build a scharacter string */
default:
if (cs == NIL || (*cs & STR) == 0) {
cs = ccre;
*cs = STR;
SCNT(cs) = 1;
ccre = SSTR(cs);
} else
SCNT(cs)++;
*ccre++ = c;
break;
}
}
if (acs != NIL) {
do {
temp = OCNT(acs);
OCNT(acs) = ccre - acs;
acs -= temp;
} while (temp != 0);
acs = NIL;
}
return;
}
/* end of convertre */
/*
* The following routine recognises an irregular expresion
* with the following special characters:
*
* \? - means last match was optional
* \a - matches any number of characters
* \d - matches any number of spaces and tabs
* \p - matches any number of alphanumeric
* characters. The
* characters matched will be copied into
* the area pointed to by 'name'.
* \| - alternation
* \( \) - grouping used mostly for alternation and
* optionality
*
* The irregular expression must be translated to internal form
* prior to calling this routine
*
* The value returned is the pointer to the last character matched.
* If strtptr is non-null, a pointer to the start of the string matched
* (excluding \a matches) will be returned at *strtptr.
* If mstring is non-null, the string matched by a \p will be copied
* into mstring.
*/
boolean _escaped; /* true if we are currently _escaped */
char *_start; /* start of string */
char *expmatch (char *s, char *re, char **strtptr, char *mstring)
/* register char *s; string to check for a match in */
/* register char *re; a converted irregular expression */
/* char **strtptr; where to put ptr to start of match */
/* char *mstring; where to put whatever matches a \p */
{
register char *cs; /* the current symbol */
register char *ptr, *s1; /* temporary pointer */
register char c; /* temporary */
boolean matched; /* a temporary boolean */
/* initial conditions */
if ( strtptr )
*strtptr = NIL;
if (re == NIL)
return (NIL);
cs = re;
matched = FALSE;
/* loop till expression string is exhausted (or at least pretty tired) */
while (*cs) {
switch (*cs & (OPER | STR | META)) {
/* try to match a string */
case STR:
matched = !((*re_strncmp)(s, SSTR(cs), SCNT(cs)));
if (matched) {
/* hoorah it matches */
s += SCNT(cs);
cs = SNEXT(cs);
} else if (*cs & ALT) {
/* alternation, skip to next expression */
cs = SNEXT(cs);
} else if (*cs & OPT) {
/* the match is optional */
cs = SNEXT(cs);
matched = 1; /* indicate a successful match */
} else {
/* no match, error return */
return (NIL);
}
break;
/* an operator, do something fancy */
case OPER:
switch (OSYM(cs)) {
/* this is an alternation */
case '|':
if (matched)
/* last thing in the alternation was a match, skip ahead */
cs = OPTR(cs);
else
/* no match, keep trying */
cs = ONEXT(cs);
break;
/* this is a grouping, recurse */
case '(':
ptr = expmatch (s, ONEXT(cs), strtptr, mstring);
if (ptr != NIL) {
/* the subexpression matched */
matched = 1;
s = ptr;
} else if (*cs & ALT) {
/* alternation, skip to next expression */
matched = 0;
} else if (*cs & OPT) {
/* the match is optional */
matched = 1; /* indicate a successful match */
} else {
/* no match, error return */
return (NIL);
}
cs = OPTR(cs);
break;
}
break;
/* try to match a metasymbol */
case META:
switch (MSYM(cs)) {
/* try to match anything and remember what was matched */
case 'p':
/*
* This is really the same as trying the match the
* remaining parts of the expression to any subset
* of the string.
*/
s1 = s;
do {
ptr = expmatch (s1, MNEXT(cs), strtptr, mstring);
if (ptr != NIL && s1 != s) {
/* we have a match, remember the match */
if ( mstring ) {
strncpy (mstring, s, s1 - s);
mstring[s1 - s] = '\0';
}
return (ptr);
} else if (ptr != NIL && (*cs & OPT)) {
/* \p was optional so no match is ok */
return (ptr);
} else if (ptr != NIL) {
/* not optional and we still matched */
return (NIL);
}
if (!isalnum(*s1) && *s1 != '_')
return (NIL);
if (*s1 == '\\')
_escaped = _escaped ? FALSE : TRUE;
else
_escaped = FALSE;
} while (*s1++);
return (NIL);
/* try to match anything */
case 'a':
/*
* This is really the same as trying the match the
* remaining parts of the expression to any subset
* of the string.
*/
s1 = s;
do {
/*
* Hack for an important special case: if the next thing
* in the pattern is a string, just gobble characters until
* we find something that matches that string (this saves
* the cost of a recursive call on expmatch while scanning
* for the start of comments or strings). Since many
* patterns end with a string, we also treat that as a
* special case.
*/
if( *(ptr=MNEXT(cs)) == STR ) {
c = *SSTR(ptr);
while( *s1 && *s1 != c )
s1++;
if ( *s1 == 0 )
return(NIL);
if ( SNEXT(ptr) == 0 && (s1 != s || *cs & OPT)) {
/* next item is a string, it's the last item and
* the \a match is ok - just loop to try & match
* the string.
*/
if ( strtptr )
*strtptr = s1;
cs = ptr;
s = s1;
break;
}
}
ptr = expmatch (s1, MNEXT(cs), strtptr, mstring);
if (ptr != NIL && (s1 != s || *cs & OPT)) {
/* we have a match */
if ( strtptr )
*strtptr = s1;
return(ptr);
} else if (ptr != NIL) {
/* not optional and we still matched */
return (NIL);
}
if (*s1 == '\\')
_escaped = _escaped ? FALSE : TRUE;
else
_escaped = FALSE;
} while (*s1++);
return (NIL);
/* fail if we are currently _escaped */
case 'e':
if (_escaped)
return(NIL);
cs = MNEXT(cs);
break;
/* match any number of tabs and spaces */
case 'd':
ptr = s;
while (*s == ' ' || *s == '\t')
s++;
if (s != ptr || s == _start) {
/* match, be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*s == '\n' || *s == '\0') {
/* match, be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match, error return */
return (NIL);
break;
/* check for end of line */
case '$':
if (*s == '\0' || *s == '\n') {
/* match, be happy */
s++;
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match, error return */
return (NIL);
break;
/* check for start of line */
case '^':
if (s == _start) {
/* match, be happy */
matched = 1;
cs = MNEXT(cs);
} else if (*cs & ALT) {
/* try the next part */
matched = 0;
cs = MNEXT(cs);
} else if (*cs & OPT) {
/* doesn't matter */
matched = 1;
cs = MNEXT(cs);
} else
/* no match, error return */
return (NIL);
break;
/* end of a subexpression, return success */
case ')':
return (s);
}
break;
}
}
return (s);
}